In [ ]:
"""U.S. ESG FEATURES SAMPLE"""

'''SAMPLE SMALLER AS ESG DATA BEGINS END OF 2001

ESG SAMPLE - 
26,062 observations 
768 firms

14,592 from developed nations - 17 countries 
6,119 from developing nations - 19 countries

NON-ESG SAMPLE
85,553 observations
1858 firms'''
In [60]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt

plt.style.use('bmh')

os.chdir('/Users/lockiemichalski/Documents/Files_Mac_before_clean/Desktop/UQ/RQ3_2020/Credit_rating_WRDS/Steps_sample/Files to upload rand/US_data/Draft2_data')
In [61]:
data = pd.read_csv('US_SAMPLE_ESG_ONLY.csv').iloc[:,1:]
/Users/lockiemichalski/opt/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3071: DtypeWarning: Columns (247,288,289,294,398,399,400,401,402,403) have mixed types.Specify dtype option on import or set low_memory=False.
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
In [4]:
data.sort_values(by='rating_rank', inplace=True)
In [67]:
print(str(len(data)) + ' observations')
print(str(len(data['Instrument'].unique())) + ' unique firms')
26062 observations
768 unique firms
In [6]:
"WITHOUT FINANCIALS"

plt.figure(figsize = (20, 10), dpi = 200)
sns.countplot(x='Issuer Rating', data=data).set_title('All-classes')
# class C is not in graph as there are only 2 observations and for classifiers to run this has to removed. 
Out[6]:
Text(0.5, 1.0, 'All-classes')
In [31]:
"Function to plot the missing values"

def missing_value_plot(df,title):
    null_counts = df.isnull().sum()/len(df)
    null_counts = null_counts.sort_values()
    plt.figure(figsize=(16,8))
    plt.xticks(np.arange(len(null_counts))+0.5,null_counts.index,rotation='vertical')
    plt.ylabel('fraction of rows with missing data (%)')
    plt.bar(np.arange(len(null_counts)),null_counts)
    plt.title(title)
    plt.xlabel('Features')
In [32]:
financial_features = ['accrualq', 'aftret_eqq', 'aftret_equityq', 'aftret_invcapxq', 'at_turnq', 
                      'bmq', 'capeiq', 'capital_ratioq', 'cash_conversionq', 'cash_debtq', 'cash_ltq',
                      'cash_ratioq', 'cfmq', 'curr_debtq', 'curr_ratioq', 'debt_atq', 'de_ratioq', 
                      'debt_capitalq', 'debt_ebitdaq', 'debt_invcapq', 'divyieldq', 'dltt_beq', 
                      'dprq', 'efftaxq', 'equity_invcapq', 'evmq', 'fcf_ocfq', 'gpmq', 'GProfq', 
                      'int_debtq', 'int_totdebtq', 'intcov_ratioq', 'intcovq', 'inv_turnq', 
                      'invt_actq', 'lt_atq', 'lt_debtq', 'lt_ppentq', 'mktcap', 'npmq', 'ocf_lctq', 
                      'opmadq', 'opmbdq', 'pay_turnq', 'pcfq', 'pe_exiq', 'pe_incq', 'pe_op_basicq',
                      'pe_op_dilq', 'PEG_1yrforward', 'PEG_ltgforward', 'PEG_trailing', 'pretret_earnatq',
                      'pretret_noq', 'price_adj', 'price_unadj', 'profit_lctq', 'psq', 'ptbq', 'ptpmq',
                      'quick_ratioq', 'rd_saleq', 'rect_actq', 'rect_turnq', 'roaq', 'roceq', 'roeq',
                      'sale_equityq', 'sale_invcapq', 'sale_nwcq', 'short_debtq', 'totdebt_invcapq',
                      'gind', 'ggroup', 'gsector']
missing_value_plot(data[financial_features], 'Financial ratios - without gsector 40') # financial ratios missing data + 3 categorical industry 
In [9]:
"""Financial ratios: divyield should be filled with 0 as these would be firms with no dividend, 
therefore median not correct to use. This is probable for some of the debt ratios too as some firms might not actually 
have any debt leading to such high missing values. rd_saleq needs to go. Similarly, PE ratio types exhibit
lots of missing data"""
categorical_ind = ['gind', 'ggroup', 'gsector']
In [33]:
missing_value_plot(data[['Rec_Total','Rec_Median','Rec_Low','Rec_High','Rec_SBuy','Rec_Buy','Rec_Hold',
 'Rec_Sell','Rec_SSell','Rec_NoOpinion','LTG_Mean','Num_Analyst']], 'Analyst recommendations')

'''NOTE: Once a firm has a recommendation, all the columns have values. This missing data is prior to the 
firm having any recommendations. What should we do about this?'''
Out[33]:
'NOTE: Once a firm has a recommendation, all the columns have values. This missing data is prior to the \nfirm having any recommendations. What should we do about this?'
In [34]:
categorical_inst = ['Top10InstOwn','NumInstBlockOwners','InstBlockOwn','NumInstOwners',
 'MaxInstOwn','InstOwn','InstOwn_HHI','InstOwn_Perc']
missing_value_plot(data[categorical_inst], "Inst holdings")
In [35]:
macro_features = ['balance_on_trade_bop', 'business_confidence', 'cab%gdp', 'central_government_deficit', 
                          'consumer_confidence', 'consumer_spending', 'current_account_balance', 
                          'export_goods_bop', 'export_prices', 'exports_goods_services', 'foreign_trade_balance', 
                          'government_bond', 'government_external_debt', 'government_spending', 
                          'gross_fixed_capital_investment', 'import_prices', 'imports_goods_bop', 
                          'imports_goods_services', 'industrial_production', 'international_reserves',
                          'labour_force_survey', 'merchandise_exports', 'merchandise_imports', 'money_supply_M0', 
                          'money_supply_M1', 'money_supply_M2', 'overnight_rate', 'population', 
                          'retail_sales', 'sov_outlook_rank','stock_index', 'terms_of_trade', 
                          'unemployment_rate']
missing_value_plot(data[macro_features], 'Macro + Macro growth')
In [50]:
#non-missing df excluding ESG features with missing data - refer to sum stats GLOBAL csv file
''' 420 ESG features '''

'''Referred excel file Global sum stats '''
ESG_sum_stats = pd.read_csv('US_ESG_sum_stats.csv')

'''217 features have <5% missing values and 195 < 2%  missing values'''
#Split it up to four plots 
ESG_217_cols = list(ESG_sum_stats.iloc[:,1:218].columns)
list_ESG = np.array_split(ESG_217_cols, 4)
missing_value_plot(data[list(list_ESG[0])], "ESG first features")
In [51]:
missing_value_plot(data[list(list_ESG[1])], "ESG second features")
In [52]:
missing_value_plot(data[list(list_ESG[2])], "ESG third features")
In [53]:
missing_value_plot(data[list(list_ESG[3])], "ESG fourth features")
In [41]:
 missing_value_plot(data[list(ESG_sum_stats.iloc[:,219:300].columns)], "ESG part of remaining")
In [42]:
categorical_inst = ['Top10InstOwn','NumInstBlockOwners','InstBlockOwn','NumInstOwners',
 'MaxInstOwn','InstOwn','InstOwn_HHI','InstOwn_Perc']
categorical_macro = ['sov_rating_rank','sov_outlook_rank']
categorical_analyst = ['Rec_Total','Rec_Median','Rec_Low','Rec_High','Rec_SBuy','Rec_Buy','Rec_Hold',
 'Rec_Sell','Rec_SSell','Rec_NoOpinion','LTG_Mean','Num_Analyst']
categorical_ind = ['gind', 'ggroup', 'gsector']
In [43]:
categorical = categorical_ind + categorical_inst + categorical_analyst + categorical_macro
In [14]:
'''Box plot of inst categorical features'''

fig, ax = plt.subplots(3, 3, figsize=(30, 30))
for var, subplot in zip(categorical_inst, ax.flatten()):
    sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
In [15]:
'''Box plot of analyst categorical features'''

fig, ax = plt.subplots(3, 3, figsize=(30, 30))
for var, subplot in zip(categorical_analyst, ax.flatten()):
    sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
In [16]:
'''Box plot of industry categorical features'''

fig, ax = plt.subplots(1, 3, figsize=(20, 5))
for var, subplot in zip(categorical_ind, ax.flatten()):
    sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
In [17]:
'''Box plot of financial ratio features - Capitalization'''

fin_capitalization=['capital_ratioq','equity_invcapq','debt_invcapq','totdebt_invcapq']

fig, ax = plt.subplots(2,2, figsize=(20, 10))
for var, subplot in zip(fin_capitalization, ax.flatten()):
    sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
    
In [18]:
'''Box plot of financial ratio features - Efficiency'''

fin_efficiency=['at_turnq','inv_turnq','pay_turnq','rect_turnq','sale_equityq','sale_invcapq','sale_nwcq']

fig, ax = plt.subplots(4,2, figsize=(30, 30))
for var, subplot in zip(fin_efficiency, ax.flatten()):
    sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
In [19]:
'''Box plot of financial ratio features - Financial Soundness'''

fin_finsoundness=['invt_actq','rect_actq','fcf_ocfq','ocf_lctq','cash_debtq',
                'cash_ltq','cfmq','short_debtq','profit_lctq',
                'curr_debtq','debt_ebitdaq','dltt_beq','int_debtq',
                'int_totdebtq','lt_debtq','lt_ppentq']

fig, ax = plt.subplots(8,2, figsize=(30, 30))
for var, subplot in zip(fin_finsoundness, ax.flatten()):
    sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
In [20]:
'''Box plot of financial ratio features - Liquidity'''

fin_liquidity=['cash_ratioq','curr_ratioq','quick_ratioq','accrualq','rd_saleq']

fig, ax = plt.subplots(3,2, figsize=(30, 30))
for var, subplot in zip(fin_liquidity, ax.flatten()):
    sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
In [21]:
'''Box plot of financial ratio features - Profitability''' #NO FINANCIALS

fin_profitability = ['efftaxq','GProfq','aftret_eqq','aftret_equityq','aftret_invcapxq','gpmq','npmq','opmadq',
'opmbdq','pretret_earnatq','pretret_noq','ptpmq','roaq','roceq','roeq']

fig, ax = plt.subplots(8,2, figsize=(30, 30))
for var, subplot in zip(fin_profitability, ax.flatten()):
    sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
In [22]:
'''Box plot of financial ratio features - Profitability'''

fin_profitability = ['efftaxq','GProfq','aftret_eqq','aftret_equityq','aftret_invcapxq','gpmq','npmq','opmadq',
'opmbdq','pretret_earnatq','pretret_noq','ptpmq','roaq','roceq','roeq']

fig, ax = plt.subplots(8,2, figsize=(30, 30))
for var, subplot in zip(fin_profitability, ax.flatten()):
    sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
In [23]:
'''Box plot of financial ratio features - Solvency'''

fin_solvency = ['de_ratioq','debt_atq','lt_atq','debt_capitalq','intcovq','intcov_ratioq']

fig, ax = plt.subplots(3,2, figsize=(30, 30))
for var, subplot in zip(fin_solvency, ax.flatten()):
    sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
In [24]:
'''Box plot of financial ratio features - Valuation'''

fin_valuation = ['dprq','PEG_1yrforward','PEG_ltgforward','PEG_trailing','bmq','capeiq','divyieldq',
 'evmq','pcfq','pe_exiq','pe_incq','pe_op_basicq','pe_op_dilq','psq','ptbq']

fig, ax = plt.subplots(8,2, figsize=(30, 30))
for var, subplot in zip(fin_valuation, ax.flatten()):
    sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
In [25]:
'''Box plot of financial ratio features - other'''

fin = ['mktcap','price_adj']

fig, ax = plt.subplots(1,3, figsize=(30, 10))
for var, subplot in zip(fin, ax.flatten()):
    sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
In [26]:
'''Macro and macro_growth features'''
macro_features_growth = [col for col in data if col.endswith('%')]
In [27]:
'''Box plot of numeric macro features'''

fig, ax = plt.subplots(9, 4, figsize=(40, 30))
for var, subplot in zip(macro_features, ax.flatten()):
    sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
    
In [97]:
'''Box plot of numeric macro growth features'''

fig, ax = plt.subplots(9, 4, figsize=(40, 30))
for var, subplot in zip(macro_features_growth, ax.flatten()):
    sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
    
In [46]:
"""ESG first 67 features"""

fig, ax = plt.subplots(14, 4, figsize=(40, 50))
for var, subplot in zip(list(list_ESG[0]), ax.flatten()):
    sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
In [49]:
"""ESG second 50 features"""

fig, ax = plt.subplots(14, 4, figsize=(40, 50))
for var, subplot in zip(list(list_ESG[1]), ax.flatten()):
    sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
In [54]:
fig, ax = plt.subplots(14, 4, figsize=(40, 50))
for var, subplot in zip(list(list_ESG[2]), ax.flatten()):
    sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
In [55]:
fig, ax = plt.subplots(14, 4, figsize=(40, 50))
for var, subplot in zip(list(list_ESG[2]), ax.flatten()):
    sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
In [58]:
# get the distribution of a feature per class with Yeo-Johnson transformation

from scipy.stats import anderson
from scipy.stats import kstest
#from scipy.stats import zscore
from scipy.stats import yeojohnson

import scipy.stats as st

'''Interested in each distribution by each class'''
def get_best_distribution_groupby(data,rating):
    dist_names = ["norm", "lognorm","exponweib", "weibull_max", 
                  "weibull_min", "pareto", "genextreme","logistic"]
    dist_results = []
    params = {}
    for dist_name in dist_names:
        dist = getattr(st, dist_name)
        param = dist.fit(data)

        params[dist_name] = param
        # Applying the Kolmogorov-Smirnov test
        D, p = kstest(data, dist_name, args=param)
        dist_results.append((dist_name, p))

    # select the best fitted distribution
    best_dist, best_p = (max(dist_results, key=lambda item: item[1]))
    # store the name of the best fit and its p value

    print("Best fitting distribution " +str(rating)+": " +str(best_dist))
    return best_dist

def dist_by_class(class_col, feature_col, df):
    df = df[[class_col,feature_col]]
    unique_ir = df[class_col].unique()
    #rating_zscores = []
    rating_yeojohnson = []
    rating_dist = []
    for rating in unique_ir:
        data = df[df[class_col]==rating]
        #z = zscore(data[feature_col].dropna())
        xt, lmbda = yeojohnson(np.array(data[feature_col].dropna()))
        if len(xt) == 0:
            print('No obs')
        else:
            #rating_zscores.append(xt)
            rating_yeojohnson.append(xt)
            print(str(len(xt)) + " length without outliers")
            """For anderson:
            dist must be 'norm', 'expon', 'gumbel', 'extreme1' or 'logistic'."""
            and_darl = anderson(xt, 'norm') #and_darl
            #print("and_darl_test:  " + str(and_darl))   
            best_dist = get_best_distribution_groupby(xt,rating) #kstest
            rating_dist.append(best_dist)
            
    fig, ax = plt.subplots(5,5, 
                          figsize=(30,30))
    for rating,rating_dist,unique_rating,subplot in zip(rating_yeojohnson,rating_dist,
                                                        list(unique_ir), ax.flatten()):
        sns.distplot(rating, ax=subplot).set_title(str(unique_rating)+' '+str(rating_dist))
In [59]:
dist_by_class('Issuer Rating', 'ESG Combined Score', data)
224 length without outliers
/Users/lockiemichalski/opt/anaconda3/lib/python3.8/site-packages/scipy/stats/_continuous_distns.py:1677: RuntimeWarning: invalid value encountered in add
  logp = (np.log(a) + np.log(c) + sc.xlogy(a - 1.0, exm1c) +
/Users/lockiemichalski/opt/anaconda3/lib/python3.8/site-packages/scipy/stats/_distn_infrastructure.py:2429: RuntimeWarning: invalid value encountered in double_scalars
  Lhat = muhat - Shat*mu
/Users/lockiemichalski/opt/anaconda3/lib/python3.8/site-packages/scipy/stats/_continuous_distns.py:2666: RuntimeWarning: invalid value encountered in subtract
  -pex2+logpex2-logex2)
Best fitting distribution AAA: logistic
559 length without outliers
Best fitting distribution AA: exponweib
376 length without outliers
Best fitting distribution AA-: exponweib
1211 length without outliers
Best fitting distribution A+: norm
1447 length without outliers
Best fitting distribution A: lognorm
1997 length without outliers
Best fitting distribution A-: weibull_min
3242 length without outliers
Best fitting distribution BBB+: logistic
3950 length without outliers
Best fitting distribution BBB: weibull_min
3276 length without outliers
Best fitting distribution BBB-: genextreme
2153 length without outliers
Best fitting distribution BB+: weibull_min
1953 length without outliers
Best fitting distribution BB: weibull_max
2037 length without outliers
Best fitting distribution BB-: norm
1835 length without outliers
Best fitting distribution B+: lognorm
793 length without outliers
Best fitting distribution B: exponweib
258 length without outliers
Best fitting distribution B-: exponweib
166 length without outliers
Best fitting distribution CCC+: weibull_min
36 length without outliers
Best fitting distribution CCC: logistic
53 length without outliers
Best fitting distribution CCC-: genextreme
40 length without outliers
Best fitting distribution CC: weibull_min
26 length without outliers
Best fitting distribution D: genextreme
In [ ]: